Import the Pandas library
In [1]:
import pandas as pd
Load the train and test datasets to create two DataFrames
In [2]:
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)
Print the head of the train and test DataFrames
In [3]:
print(train.head())
print(test.head())
Print the dimensions of the train and test DataFrames
In [4]:
print(train.shape)
print(test.shape)
Print summary statistics for the train and test DataFrames
In [5]:
print(train.describe())
print(test.describe())
Passengers that survived vs passengers that passed away
In [6]:
print(train["Survived"].value_counts())
As proportions
In [7]:
print(train["Survived"].value_counts(normalize=True))
Males that survived vs males that passed away
In [8]:
print(train["Survived"][train["Sex"] == 'male'].value_counts())
Females that survived vs females that passed away
In [9]:
print(train["Survived"][train["Sex"] == 'female'].value_counts())
Normalized male survival
In [10]:
print(train["Survived"][train["Sex"] == 'male'].value_counts(normalize=True))
Normalized female survival
In [11]:
print(train["Survived"][train["Sex"] == 'female'].value_counts(normalize=True))
Create the column Child and initialize it to NaN
In [12]:
train["Child"] = float('NaN')
Assign 1 to passengers under 18, 0 to those 18 or older. Print the new column.
In [13]:
train["Child"][train["Age"] < 18] = 1
train["Child"][train["Age"] >= 18] = 0
print(train['Child'])
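Aside: the same column can be built without row-wise assignment; a vectorized sketch (note it imports NumPy here, which the notebook otherwise imports further down)
In [ ]:
import numpy as np
# NaN where Age is missing, else 1.0 for passengers under 18 and 0.0 otherwise
train["Child"] = np.where(train["Age"].isnull(), float("NaN"), (train["Age"] < 18).astype(float))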
Print normalized Survival Rates for passengers under 18
In [14]:
print(train["Survived"][train["Child"] == 1].value_counts(normalize = True))
Print normalized Survival Rates for passengers 18 or older
In [15]:
print(train["Survived"][train["Child"] == 0].value_counts(normalize = True))
Create a copy of test: test_one
In [16]:
test_one = test.copy()
Initialize a Survived column to 0
In [17]:
test_one["Survived"] = 0
Set Survived to 1 if Sex equals "female" and print the Survived column from test_one
In [18]:
test_one.loc[test_one["Sex"] == "female", "Survived"] = 1
print(test_one["Survived"])
Convert the male and female groups to integer form
In [19]:
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1
test["Sex"][test["Sex"] == "male"] = 0
test["Sex"][test["Sex"] == "female"] = 1
Impute the Embarked variable
In [20]:
train["Embarked"] = train["Embarked"].fillna('S')
test["Embarked"] = test["Embarked"].fillna('S')
Convert the Embarked classes to integer form
In [21]:
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2
test["Embarked"][test["Embarked"] == "S"] = 0
test["Embarked"][test["Embarked"] == "C"] = 1
test["Embarked"][test["Embarked"] == "Q"] = 2
Print the Sex and Embarked columns
In [22]:
print(train["Embarked"])
print(train["Sex"])
In [23]:
print(test["Embarked"])
print(test["Sex"])
Import the NumPy library
In [24]:
import numpy as np
Import 'tree' from the scikit-learn library
In [25]:
from sklearn import tree
Print the train data to see the available features
In [26]:
print(train)
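Aside: before imputing, it helps to see which columns actually contain missing values; a quick sketch
In [ ]:
# Missing-value count per column; Age is the main gap among the features used below
print(train.isnull().sum())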
Fill the NaN values with the column medians
In [27]:
feature_cols = ["Pclass", "Sex", "Age", "Fare"]
train[feature_cols] = train[feature_cols].fillna(train[feature_cols].median())
print(train)
Create the target and features NumPy arrays: target, features_one
In [28]:
target = train["Survived"].values
features_one = train[["Pclass", "Sex", "Age", "Fare"]].values
Fit your first decision tree: my_tree_one
In [29]:
my_tree_one = tree.DecisionTreeClassifier()
my_tree_one = my_tree_one.fit(features_one, target)
Look at the importance and score of the included features
In [30]:
print(my_tree_one.feature_importances_)
print(my_tree_one.score(features_one, target))
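Note that .score on the training data is optimistic for an unpruned tree, which can memorize its inputs; a sketch of a fairer estimate using 5-fold cross-validation (sklearn.model_selection.cross_val_score)
In [ ]:
from sklearn.model_selection import cross_val_score
# Mean accuracy across 5 folds: a less optimistic estimate than training accuracy
scores = cross_val_score(tree.DecisionTreeClassifier(random_state=1), features_one, target, cv=5)
print(scores.mean())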
Impute the missing values in the test set with the column medians
In [31]:
# Alternative: impute only the single missing Fare value by position
# test.Fare[152] = test.Fare.median()
feature_cols = ["Pclass", "Sex", "Age", "Fare"]
test[feature_cols] = test[feature_cols].fillna(test[feature_cols].median())
Extract the features from the test set: Pclass, Sex, Age, and Fare.
In [32]:
test_features = test[["Pclass", "Sex", "Age", "Fare"]].values
Make your prediction using the test set
In [33]:
first_prediction = my_tree_one.predict(test_features)
print(first_prediction)
Create a DataFrame with two columns, PassengerId & Survived, where Survived contains your predictions
In [34]:
PassengerId = np.array(test["PassengerId"]).astype(int)
print(PassengerId.shape)
first_solution = pd.DataFrame(first_prediction, PassengerId, columns=["Survived"])
print(first_solution)
Check that your data frame has 418 entries
In [35]:
print(first_solution.shape)
Write your solution to a CSV file named first_solution.csv
In [36]:
first_solution.to_csv("../submissions/first_solution.csv", index_label=["PassengerId"])
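Aside: an equivalent, arguably clearer way to build the submission is with named columns and index=False; a sketch producing the same file format
In [ ]:
# Same submission built with an explicit column dict
submission = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": first_prediction})
submission.to_csv("../submissions/first_solution.csv", index=False)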
Create a new array with the added features: features_two
In [37]:
features_two = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
Control overfitting by setting "max_depth" to 10 and "min_samples_split" to 5: my_tree_two
In [38]:
max_depth = 10
min_samples_split = 5
my_tree_two = tree.DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, random_state=1)
my_tree_two = my_tree_two.fit(features_two, target)
Print the score of the new decision tree
In [39]:
print(my_tree_two.score(features_two, target))
Extract the same features from the test set
In [40]:
test_features_two = test[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
Make your prediction with the second tree and check its shape
In [41]:
second_prediction = my_tree_two.predict(test_features_two)
print(second_prediction)
print(second_prediction.shape)
Create a DataFrame for the second solution
In [42]:
second_solution = pd.DataFrame(second_prediction, PassengerId, columns=["Survived"])
print(second_solution)
Check that your data frame has 418 entries
In [43]:
print(second_solution.shape)
Write your solution to a CSV file named second_solution.csv
In [44]:
second_solution.to_csv("../submissions/second_solution.csv", index_label=["PassengerId"])
Create train_two with the newly defined feature, family_size
In [46]:
train_two = train.copy()
train_two["family_size"] = train_two["SibSp"] + train_two["Parch"] + 1
Create a new feature set that includes the new feature
In [48]:
features_three = train_two[["Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", "family_size"]].values
Define the tree classifier, then fit the model
In [50]:
my_tree_three = tree.DecisionTreeClassifier()
my_tree_three = my_tree_three.fit(features_three, target)
Print the score of this decision tree
In [52]:
print(my_tree_three.score(features_three, target))
Import the RandomForestClassifier
In [53]:
from sklearn.ensemble import RandomForestClassifier
We want the Pclass, Age, Sex, Fare, SibSp, Parch, and Embarked variables
In [54]:
features_forest = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
target = train["Survived"]
Build and fit my_forest
In [55]:
forest = RandomForestClassifier(max_depth=10, min_samples_split=2, n_estimators=100, random_state=1)
my_forest = forest.fit(features_forest, target)
Print the score of the fitted random forest
In [56]:
print(my_forest.score(features_forest, target))
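Again, this is training accuracy. Random forests also provide a built-in out-of-bag estimate; a sketch with oob_score=True (it requires bootstrap sampling, which is the default)
In [ ]:
# Each tree is evaluated on the rows left out of its bootstrap sample
forest_oob = RandomForestClassifier(max_depth=10, min_samples_split=2, n_estimators=100, oob_score=True, random_state=1)
forest_oob = forest_oob.fit(features_forest, target)
print(forest_oob.oob_score_)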
Compute predictions on our test set features, then print the length of the prediction vector
In [57]:
test_features = test[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values
pred_forest = my_forest.predict(test_features)
print(len(pred_forest))
Create a DataFrame for the third solution
In [58]:
PassengerId = np.array(test["PassengerId"]).astype(int)
third_solution = pd.DataFrame(pred_forest, PassengerId, columns=["Survived"])
print(third_solution)
Check that your data frame has 418 entries
In [59]:
print(third_solution.shape)
Write your solution to a CSV file named third_solution.csv
In [60]:
third_solution.to_csv("../submissions/third_solution.csv", index_label=["PassengerId"])
Request and print the .feature_importances_ attribute
In [61]:
print(my_tree_two.feature_importances_)
print(my_forest.feature_importances_)
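The raw arrays are hard to read without the feature names; a sketch pairing them up for the forest
In [ ]:
# Pair each feature name with its importance in the fitted forest
feature_names = ["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]
for name, importance in zip(feature_names, my_forest.feature_importances_):
    print(name, importance)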
Compute and print the mean accuracy score for both models
In [62]:
print(my_tree_two.score(features_two, target))
print(my_forest.score(features_forest, target))
Florent Amato